home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.4)
-
- '''E-mail fingerprint extraction library
-
- Client interface to fingerprint database needed for centralized
- ham/spam recognition.
- '''
- from __future__ import division
- import re
- import sys
- import types
-
- try:
- set
- except NameError:
- from sets import Set as set
-
- from spambayes import tokenizer
- from spamexperts.Options import options
-
class Fingerprint(object):
    """Factory for generating fingerprints from e-mail messages.

    As much as possible, text that has been added to a message to prevent
    fingerprint-type schemes (random text, whitespace, HTML) is removed
    before fingerprinting.

    To use, call Fingerprint.get_fingerprint() to get the fingerprint (to
    pass to the server as a query), and then Fingerprint.spamprob() with
    the query results to get a probability that the message is spam.

    NOTE: fingerprint generation method is described in:
    Feng Zhou, Li Zhuang, Ben Y. Zhao, Ling Huang, Anthony D. Joseph,
    and John Kubiatowicz. 'Approximate Object Location and Spam Filtering
    on Peer-to-Peer Systems'. Appears in Proc. of ACM/IFIP/USENIX
    Intl. Middleware Conf. (Middleware 2003).
    """

    # Matches any run of whitespace; used to collapse it to a single space.
    replace_re = re.compile(r'\s+')

    # One tokenizer instance shared by every Fingerprint; only consulted
    # when the ('fingerprint', 'use_tokenizer') option is set.
    tokenizer = tokenizer.Tokenizer()

    def get_ridges(self, data, normalise):
        """Return the list of hashes ("ridges") for one message section.

        data      -- the section: a string, or (when normalise is true) an
                     object accepted by self.normalise().
        normalise -- if true, data is first passed through self.normalise().

        The (possibly normalised) text is cut into consecutive chunks of
        options['fingerprint', 'l'] characters and every chunk is hashed
        with self._hash(); the last chunk may be shorter.
        """
        if normalise:
            data = self.normalise(data)

        chunk_length = options[('fingerprint', 'l')]
        data_length = len(data)
        if data_length < chunk_length:
            # Short data is hashed as a single chunk.  This branch is what
            # makes *empty* data still produce one hash rather than none.
            return [self._hash(data)]

        return [self._hash(data[start:start + chunk_length])
                for start in xrange(0, data_length, chunk_length)]

    def _hash(chunk):
        """Return the decimal digits of abs(hash(chunk)), reversed.

        NOTE(review): this relies on the built-in hash(); its value is not
        guaranteed to be identical across interpreter versions or builds,
        so confirm that every client talking to the same fingerprint
        database produces the same values.
        """
        return str(abs(hash(chunk)))[::-1]

    _hash = staticmethod(_hash)

    def check_ridges(self, data, normalise, ridges):
        """Return (original_text, matched) pairs for one message section.

        data      -- the section; must provide get_payload() when
                     normalise is true.
        normalise -- sections for which this is false are not checked and
                     produce an empty list.
        ridges    -- container of hashes to test membership against.

        The payload is split with split_and_norm(); 'matched' tells whether
        the hash of the normalised chunk is present in ridges.
        """
        if not normalise:
            return []

        chunk_length = options[('fingerprint', 'l')]
        payload = data.get_payload()
        return [(orig, self._hash(norm) in ridges)
                for orig, norm in self.split_and_norm(payload, chunk_length)]

    def split_and_norm(cls, data, required_length):
        """Yield (original, normalised) chunk pairs built from data.

        data is consumed one character at a time.  Every character goes
        into the original chunk; a whitespace character is normalised to a
        single space, and a space is only added to the normalised chunk if
        the previous character was not whitespace as well.  A pair is
        yielded each time the normalised chunk reaches required_length
        characters.  Whatever is left over is always yielded as a final
        pair, which may consist of two empty strings.
        """
        orig_chunk = []
        norm_chunk = []
        in_a_row = False  # True while inside a run of whitespace.
        for char in data:
            orig_chunk.append(char)
            norm = cls.replace_re.sub(' ', char)
            if norm != ' ' or not in_a_row:
                norm_chunk.append(norm)
                if len(norm_chunk) == required_length:
                    yield (''.join(orig_chunk), ''.join(norm_chunk))
                    orig_chunk = []
                    norm_chunk = []
            # The whitespace run deliberately carries over chunk boundaries.
            in_a_row = (norm == ' ')

        yield (''.join(orig_chunk), ''.join(norm_chunk))

    split_and_norm = classmethod(split_and_norm)

    def normalise(self, msg):
        """Return the normalised text of msg as a single string.

        With the ('fingerprint', 'use_tokenizer') option set, the result is
        the tokens from self.tokenizer.tokenize_body(msg) joined by single
        spaces.  Otherwise it is msg.get_payload() with every run of
        whitespace replaced by one space.
        """
        if options[('fingerprint', 'use_tokenizer')]:
            return ' '.join(self.tokenizer.tokenize_body(msg))

        return self.replace_re.sub(' ', msg.get_payload())

    def get_body(self, msg):
        """Yield (section, normalise) pairs covering the whole message.

        To get the body, we walk through the message, collecting all
        parts (not just text, although only text parts are normalised):

        * the Subject header value ('' when absent), not normalised;
        * every part whose main content type is 'text' -- the part object
          itself, flagged for normalisation;
        * for every other part, its payload, not normalised, unless the
          payload is a list (presumably a multipart container, whose
          sub-parts msg.walk() visits on their own).
        """
        yield (msg.get('Subject', ''), False)
        for part in msg.walk():
            if part.get_content_maintype() == 'text':
                yield (part, True)
                continue
            payload = part.get_payload()
            if not isinstance(payload, list):
                yield (payload, False)

    def get_fingerprint(self, msg):
        """Generate the fingerprint (a set of hash strings) of a message.

        The ('fingerprint', 'l') option defines the length of the collected
        substrings (see get_ridges()).  The hashes of all sections are
        sorted as strings and only the first ('fingerprint', 'max-n') of
        them are kept; duplicates collapse because the result is a set.
        If fewer than ('fingerprint', 'min-n') distinct hashes remain, an
        empty set is returned instead.
        """
        hash_substrings = []
        for section, normalise in self.get_body(msg):
            hash_substrings.extend(self.get_ridges(section, normalise))

        hash_substrings.sort()
        cfv = set(hash_substrings[:options[('fingerprint', 'max-n')]])
        # An empty string is never a valid ridge.
        cfv.discard('')

        if len(cfv) < options[('fingerprint', 'min-n')]:
            return set()

        return cfv

    def notate_fingerprint(self, msg, ridges):
        """Return a list of (data, matched) pairs for the whole message.

        The idea is to then draw the message and change the colour if
        matched is True.  The pairs come from check_ridges(), applied to
        every section produced by get_body().
        """
        notated = []
        for section, normalise in self.get_body(msg):
            notated.extend(self.check_ridges(section, normalise, ridges))

        return notated

    def spamprob(self, max_match_count, mail_fingerprints):
        """Return the probability that the message is spam.

        max_match_count   -- number of fingerprints that were matched
                             (converted with int()).
        mail_fingerprints -- the fingerprints of the message; only their
                             count is used.

        The result is 0.5 + (matched / total) / 2, or 0.5 when the message
        has no fingerprints at all.  When the ('globals', 'verbose') option
        is set, the two counts are also reported on stderr.
        """
        total_count = len(mail_fingerprints)
        matching_count = int(max_match_count)
        if options[('globals', 'verbose')]:
            sys.stderr.write(
                'Fingerprint: Found %s matching hashes of %s total hashes\n'
                % (matching_count, total_count))

        if total_count == 0:
            return 0.5

        # True division: the module does "from __future__ import division".
        return 0.5 + matching_count / total_count / 2
-
-
-